# Library for handling data
import pandas as pd
# Library used to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Library for measuring accuracy of ML models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Machine Learning model used
from sklearn.ensemble import RandomForestClassifier
# Library for UMAP projections
from umap.umap_ import UMAP
# Library for visualization
import plotly.express as px
df = pd.read_csv('pbta_miRactivity_results.csv')
df
| Kids_First_Biospecimen_ID | sample_id | aliquot_id | RNA_library | short_histology | molecular_subtype | age_at_diagnosis_days | reported_gender | MIMAT0010195 | MIMAT0004481 | ... | MIMAT0004986 | MIMAT0004987 | MIMAT0000094 | MIMAT0026473 | MIMAT0004510 | MIMAT0000095 | MIMAT0022842 | MIMAT0000097 | MIMAT0004678 | MIMAT0000689 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BS_GXTFW99H | 7316-2151 | 740006 | stranded | HGAT | HGG, H3 wildtype | 2166 | Male | 0.203215 | 0.170353 | ... | 0.163825 | 0.141134 | 0.192549 | 0.153315 | 0.147092 | 0.299865 | 0.134360 | 0.044772 | 0.167571 | 0.105896 |
| 1 | BS_HF3RNN2T | 7316-2155 | 564491 | stranded | Medulloblastoma | MB, Group4 | 3606 | Female | 0.183178 | 0.246241 | ... | 0.153730 | 0.147374 | 0.322979 | 0.119636 | 0.139716 | 0.185569 | 0.173047 | 0.180138 | 0.172517 | 0.151552 |
| 2 | BS_MMNRZ1EQ | 7316-2141 | 571446 | stranded | ATRT | NaN | 383 | Male | 0.056398 | -0.019984 | ... | -0.130431 | -0.140412 | -0.111823 | -0.140318 | -0.142191 | -0.041198 | -0.148094 | -0.080920 | -0.018398 | 0.060411 |
| 3 | BS_53GQKVEX | 7316-487 | 588341 | stranded | LGAT | LGG, KIAA1549-BRAF | 553 | Male | 0.169361 | 0.223773 | ... | 0.145929 | 0.145370 | 0.298415 | 0.121007 | 0.138993 | 0.214281 | 0.166954 | 0.166200 | 0.155085 | 0.144268 |
| 4 | BS_JS95PE0J | 7316-1774 | 574549 | stranded | HGAT | HGG, H3 wildtype, TP53 loss | 2431 | Male | 0.132491 | 0.170572 | ... | 0.144303 | 0.113045 | 0.219452 | 0.115481 | 0.124098 | 0.231750 | 0.119749 | -0.039174 | 0.145273 | 0.000353 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | BS_RCFQ31XF | 7316-873 | 570111 | stranded | Ependymoma | EPN, To be classified | 442 | Male | 0.157141 | 0.125819 | ... | 0.086446 | 0.070290 | 0.126832 | 0.068954 | 0.074025 | 0.174126 | 0.074397 | 0.061975 | 0.110112 | 0.136656 |
| 240 | BS_QXKRN6CR | 7316-111 | 470423 | stranded | Ependymoma | EPN, ST RELA | 4519 | Female | 0.146616 | 0.118695 | ... | 0.045809 | 0.035370 | 0.069957 | 0.042077 | 0.038712 | 0.148805 | 0.029867 | 0.066198 | 0.084429 | 0.123514 |
| 241 | BS_4FZS7TX4 | 7316-160 | 549587 | stranded | Ependymoma | EPN, To be classified | 1223 | Male | 0.194594 | 0.116444 | ... | 0.070706 | 0.075477 | 0.076592 | 0.069451 | 0.078228 | 0.165159 | 0.073962 | 0.113423 | 0.113134 | 0.197433 |
| 242 | BS_XQ5SFW35 | 7316-89 | 577715 | stranded | HGAT | HGG, H3 wildtype | 1461 | Male | 0.107664 | 0.102398 | ... | 0.100223 | 0.099694 | 0.141058 | 0.088554 | 0.105310 | 0.213635 | 0.098756 | 0.099611 | 0.059582 | 0.113228 |
| 243 | BS_GS1KR5PK | 7316-1772 | 717124 | stranded | Medulloblastoma | MB, SHH | 1221 | Male | 0.209188 | 0.094796 | ... | 0.036446 | 0.036185 | 0.015965 | 0.039582 | 0.038798 | 0.153996 | 0.027754 | 0.072525 | 0.116735 | 0.214161 |
244 rows × 2058 columns
hdf = df.drop(['Kids_First_Biospecimen_ID', 'sample_id', 'aliquot_id', 'RNA_library', 'molecular_subtype', 'age_at_diagnosis_days', 'reported_gender'], axis=1)
hdf
| short_histology | MIMAT0010195 | MIMAT0004481 | MIMAT0000062 | MIMAT0000063 | MIMAT0026472 | MIMAT0000064 | MIMAT0004484 | MIMAT0000065 | MIMAT0004485 | ... | MIMAT0004986 | MIMAT0004987 | MIMAT0000094 | MIMAT0026473 | MIMAT0004510 | MIMAT0000095 | MIMAT0022842 | MIMAT0000097 | MIMAT0004678 | MIMAT0000689 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HGAT | 0.203215 | 0.170353 | 0.044772 | 0.100227 | 0.228451 | 0.044772 | 0.118220 | 0.044772 | 0.013709 | ... | 0.163825 | 0.141134 | 0.192549 | 0.153315 | 0.147092 | 0.299865 | 0.134360 | 0.044772 | 0.167571 | 0.105896 |
| 1 | Medulloblastoma | 0.183178 | 0.246241 | 0.180138 | 0.162924 | 0.157944 | 0.180138 | 0.201304 | 0.180138 | 0.155110 | ... | 0.153730 | 0.147374 | 0.322979 | 0.119636 | 0.139716 | 0.185569 | 0.173047 | 0.180138 | 0.172517 | 0.151552 |
| 2 | ATRT | 0.056398 | -0.019984 | -0.080920 | -0.121344 | -0.050051 | -0.080920 | -0.097578 | -0.080920 | 0.017420 | ... | -0.130431 | -0.140412 | -0.111823 | -0.140318 | -0.142191 | -0.041198 | -0.148094 | -0.080920 | -0.018398 | 0.060411 |
| 3 | LGAT | 0.169361 | 0.223773 | 0.166200 | 0.160803 | 0.155500 | 0.166200 | 0.206727 | 0.166200 | 0.156179 | ... | 0.145929 | 0.145370 | 0.298415 | 0.121007 | 0.138993 | 0.214281 | 0.166954 | 0.166200 | 0.155085 | 0.144268 |
| 4 | HGAT | 0.132491 | 0.170572 | -0.039174 | -0.012450 | 0.191062 | -0.039174 | 0.025425 | -0.039174 | 0.029123 | ... | 0.144303 | 0.113045 | 0.219452 | 0.115481 | 0.124098 | 0.231750 | 0.119749 | -0.039174 | 0.145273 | 0.000353 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | Ependymoma | 0.157141 | 0.125819 | 0.061975 | 0.082500 | 0.131669 | 0.061975 | 0.106920 | 0.061975 | 0.106418 | ... | 0.086446 | 0.070290 | 0.126832 | 0.068954 | 0.074025 | 0.174126 | 0.074397 | 0.061975 | 0.110112 | 0.136656 |
| 240 | Ependymoma | 0.146616 | 0.118695 | 0.066198 | 0.063461 | 0.127475 | 0.066198 | 0.096739 | 0.066198 | 0.086737 | ... | 0.045809 | 0.035370 | 0.069957 | 0.042077 | 0.038712 | 0.148805 | 0.029867 | 0.066198 | 0.084429 | 0.123514 |
| 241 | Ependymoma | 0.194594 | 0.116444 | 0.113423 | 0.120964 | 0.112428 | 0.113423 | 0.143909 | 0.113423 | 0.159605 | ... | 0.070706 | 0.075477 | 0.076592 | 0.069451 | 0.078228 | 0.165159 | 0.073962 | 0.113423 | 0.113134 | 0.197433 |
| 242 | HGAT | 0.107664 | 0.102398 | 0.099611 | 0.126305 | 0.113148 | 0.099611 | 0.160924 | 0.099611 | 0.100055 | ... | 0.100223 | 0.099694 | 0.141058 | 0.088554 | 0.105310 | 0.213635 | 0.098756 | 0.099611 | 0.059582 | 0.113228 |
| 243 | Medulloblastoma | 0.209188 | 0.094796 | 0.072525 | 0.093812 | 0.118819 | 0.072525 | 0.119084 | 0.072525 | 0.168033 | ... | 0.036446 | 0.036185 | 0.015965 | 0.039582 | 0.038798 | 0.153996 | 0.027754 | 0.072525 | 0.116735 | 0.214161 |
244 rows × 2051 columns
# Split the dataset into features and labels
X = hdf.loc[:, hdf.columns != 'short_histology'].values
y = hdf.loc[:, hdf.columns == 'short_histology'].values.ravel()
# Split PBTA data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
#Sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(183, 2050) (61, 2050) (183,) (61,)
# Initialize random forest classifier
rfc = RandomForestClassifier(max_depth=10, random_state=0)
# Train the random forest classifier
rfc.fit(X_train, y_train)
# Make predictions using random forest classifier
rfc_y_pred = rfc.predict(X_test)
# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, rfc_y_pred)}')
Accuracy: 0.3770491803278688
# Calculate a confusion matrix
cm = confusion_matrix(y_test, rfc_y_pred, labels=rfc.classes_)
# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
labels=dict(x="Predicted Tumor Type", y="True Tumor Type", color="Productivity"),
x=df['short_histology'].unique().tolist(),
y=df['short_histology'].unique().tolist()
)
disp.show()
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(X)
proj_3d = umap_3d.fit_transform(X)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=hdf['short_histology'].tolist(), title='PBTA Clustering based on miRNA Activity (ActMiR)')
#fig_2d = px.scatter(proj_2d, x=0, y=1, color=hdf['short_histology'].tolist(),
#facet_col=df['reported_gender'].to_list(),
#facet_row=hdf['short_histology'].tolist(),
#width=1000, height=400)
# fig_2d.update_layout(
# margin=dict(l=20, r=20, t=20, b=20),
# paper_bgcolor="LightSteelBlue",
# )
#fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hdf['short_histology'].tolist())
fig_2d.update_xaxes(title_text='UMAP1')
fig_2d.update_yaxes(title_text='UMAP2')
fig_2d.show()
#fig_3d.show()
OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
meta = pd.read_csv('meta_pbta_miRNA_plus_mRNA.csv')
meta
| Kids_First_Biospecimen_ID | sample_id | aliquot_id | RNA_library | short_histology | molecular_subtype | age_at_diagnosis_days | reported_gender | |
|---|---|---|---|---|---|---|---|---|
| 0 | BS_GXTFW99H | 7316-2151 | 740006 | stranded | HGAT | HGG, H3 wildtype | 2166 | Male |
| 1 | BS_HF3RNN2T | 7316-2155 | 564491 | stranded | Medulloblastoma | MB, Group4 | 3606 | Female |
| 2 | BS_MMNRZ1EQ | 7316-2141 | 571446 | stranded | ATRT | NaN | 383 | Male |
| 3 | BS_53GQKVEX | 7316-487 | 588341 | stranded | LGAT | LGG, KIAA1549-BRAF | 553 | Male |
| 4 | BS_JS95PE0J | 7316-1774 | 574549 | stranded | HGAT | HGG, H3 wildtype, TP53 loss | 2431 | Male |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | BS_RCFQ31XF | 7316-873 | 570111 | stranded | Ependymoma | EPN, To be classified | 442 | Male |
| 240 | BS_QXKRN6CR | 7316-111 | 470423 | stranded | Ependymoma | EPN, ST RELA | 4519 | Female |
| 241 | BS_4FZS7TX4 | 7316-160 | 549587 | stranded | Ependymoma | EPN, To be classified | 1223 | Male |
| 242 | BS_XQ5SFW35 | 7316-89 | 577715 | stranded | HGAT | HGG, H3 wildtype | 1461 | Male |
| 243 | BS_GS1KR5PK | 7316-1772 | 717124 | stranded | Medulloblastoma | MB, SHH | 1221 | Male |
244 rows × 8 columns
# Export individual histology datasets
meta[meta['short_histology'] == 'Teratoma'].to_csv('teratoma_activity_meta.csv')
# Load activity scores for each tumor type
medullo = pd.read_csv('actmir_results/medulloblastoma_miRactivity_results.csv')
lgat = pd.read_csv('actmir_results/lgat_miRactivity_results.csv')
atrt = pd.read_csv('actmir_results/atrt_miRactivity_results.csv')
hgat = pd.read_csv('actmir_results/hgat_miRactivity_results.csv')
cran = pd.read_csv('actmir_results/craniopharyngioma_miRactivity_results.csv')
epend = pd.read_csv('actmir_results/ependymoma_miRactivity_results.csv')
gang = pd.read_csv('actmir_results/ganglioglioma_miRactivity_results.csv')
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(medullo.iloc[:,8:])
fig1 = px.scatter(proj_2d, x=0, y=1, color=medullo['molecular_subtype'].tolist())
fig2 = px.scatter(proj_2d, x=0, y=1, color=medullo['reported_gender'].tolist())
fig1.show()
fig2.show()
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(atrt.iloc[:,8:])
fig1 = px.scatter(proj_2d, x=0, y=1, color=atrt['reported_gender'].tolist())
fig1.show()
/Users/shehbeelarif/opt/anaconda3/lib/python3.9/site-packages/umap/umap_.py:2344: UserWarning: n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(lgat.iloc[:,8:])
fig1 = px.scatter(proj_2d, x=0, y=1, color=lgat['molecular_subtype'].tolist())
fig2 = px.scatter(proj_2d, x=0, y=1, color=lgat['reported_gender'].tolist())
fig1.show()
fig2.show()
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(hgat.iloc[:,8:])
fig1 = px.scatter(proj_2d, x=0, y=1, color=hgat['molecular_subtype'].tolist())
fig2 = px.scatter(proj_2d, x=0, y=1, color=hgat['reported_gender'].tolist())
fig1.show()
fig2.show()
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(cran.dropna().iloc[:,8:])
fig1 = px.scatter(proj_2d, x=0, y=1, color=cran.dropna()['molecular_subtype'].tolist())
fig2 = px.scatter(proj_2d, x=0, y=1, color=cran.dropna()['reported_gender'].tolist())
fig1.show()
fig2.show()
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(epend.iloc[:,8:])
fig1 = px.scatter(proj_2d, x=0, y=1, color=epend['molecular_subtype'].tolist())
fig2 = px.scatter(proj_2d, x=0, y=1, color=epend['reported_gender'].tolist())
fig1.show()
fig2.show()
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(gang.iloc[:,8:])
fig1 = px.scatter(proj_2d, x=0, y=1, color=gang['molecular_subtype'].tolist())
fig2 = px.scatter(proj_2d, x=0, y=1, color=gang['reported_gender'].tolist())
fig1.show()
fig2.show()